data <- read.csv('HR_Employee_Data.csv')
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(dplyr)
library(plotly)
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.1.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(plotly)
library(corrly)
library(ecodist)
## Warning: package 'ecodist' was built under R version 4.1.3
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.1.3
## naivebayes 0.9.7 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:ecodist':
##
## distance
## The following object is masked from 'package:kernlab':
##
## alpha
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
## Loading required package: rpart
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.1.3
library(class)
#install.packages("ggcorrplot")
#install.packages("ROCR")
#install.packages("rpart.plot")
#install.packages('ecodist')
#install.packages("remotes")
#remotes::install_github("kmaheshkulkarni/corrly")
#install.packages("hrbrthemes")
#install.packages("highcharter")
#install.packages("kernlab")
#install.packages("caTools")
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38% 53% 2
## 2 IND28133 80% 86% 5
## 3 IND07164 11% 88% 7
## 4 IND30478 72% 87% 5
## 5 IND24003 37% 52% 2
## 6 IND08609 41% 50% 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
str(data)
## 'data.frame': 14999 obs. of 11 variables:
## $ ï..Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : chr "38%" "80%" "11%" "72%" ...
## $ last_evaluation : chr "53%" "86%" "88%" "87%" ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
#glimpse(data)
summary(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## Length:14999 Length:14999 Length:14999 Min. :2.000
## Class :character Class :character Class :character 1st Qu.:3.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.803
## 3rd Qu.:5.000
## Max. :7.000
## average_montly_hours time_spend_company Work_accident left
## Min. : 96.0 Min. : 2.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:156.0 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :200.0 Median : 3.000 Median :0.0000 Median :0.0000
## Mean :201.1 Mean : 3.498 Mean :0.1446 Mean :0.2381
## 3rd Qu.:245.0 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :310.0 Max. :10.000 Max. :1.0000 Max. :1.0000
## promotion_last_5years Department salary
## Min. :0.00000 Length:14999 Length:14999
## 1st Qu.:0.00000 Class :character Class :character
## Median :0.00000 Mode :character Mode :character
## Mean :0.02127
## 3rd Qu.:0.00000
## Max. :1.00000
cbind(lapply(lapply(data, is.na), sum))
## [,1]
## ï..Emp_Id 0
## satisfaction_level 0
## last_evaluation 0
## number_project 0
## average_montly_hours 0
## time_spend_company 0
## Work_accident 0
## left 0
## promotion_last_5years 0
## Department 0
## salary 0
sum(is.na(data))
## [1] 0
data$satisfaction_level<-gsub("%","",as.character(data$satisfaction_level))
data$satisfaction_level=as.integer(data$satisfaction_level)
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53% 2
## 2 IND28133 80 86% 5
## 3 IND07164 11 88% 7
## 4 IND30478 72 87% 5
## 5 IND24003 37 52% 2
## 6 IND08609 41 50% 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
data$last_evaluation<-gsub("%","",as.character(data$last_evaluation))
data$last_evaluation=as.integer(data$last_evaluation)
head(data)
## ï..Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53 2
## 2 IND28133 80 86 5
## 3 IND07164 11 88 7
## 4 IND30478 72 87 5
## 5 IND24003 37 52 2
## 6 IND08609 41 50 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
#Renaming the Column name in Dataframe
data <- data %>%
rename(Emp_Id=ï..Emp_Id )
head(data)
## Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53 2
## 2 IND28133 80 86 5
## 3 IND07164 11 88 7
## 4 IND30478 72 87 5
## 5 IND24003 37 52 2
## 6 IND08609 41 50 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
corr <- round(cor(data[2:9]), 1)
ggcorrplot(corr, lab = TRUE)
ans=crosstab(data$Department,data$left)
Department=rownames(ans)
fig <- plot_ly(ans,x = ~Department, y = ~X0, type = 'bar', name = 'Working for Company')
fig<- fig %>% add_trace(y =~X1, name = 'Left the Company')
fig <- fig %>% layout(title="Employees who have left based on department", yaxis = list(title = 'Count'), barmode = 'group')
fig
ans=crosstab(data$salary,data$left)
Salary=rownames(ans)
fig <- plot_ly(ans,x = ~Salary, y = ~X0, type = 'bar', name = 'Working for Company')
fig<- fig %>% add_trace(y =~X1, name = 'Left the Company')
fig <- fig %>% layout(title="Employees who have left based on salary", yaxis = list(title = 'Count'), barmode = 'group')
fig
ans=crosstab(data$time_spend_company,data$left)
ans
## X0 X1
## 2 3191 53
## 3 4857 1586
## 4 1667 890
## 5 640 833
## 6 509 209
## 7 188 0
## 8 162 0
## 10 214 0
Time_Spent=rownames(ans)
Time_Spent
## [1] "2" "3" "4" "5" "6" "7" "8" "10"
fig <- plot_ly(ans,x = ~Time_Spent, y = ~X0, type = 'scatter', mode = 'lines', name = 'Working for Company', fill = 'tozeroy')
fig <- fig %>% add_trace(y = ~X1, name = 'Left the Company', fill = 'tozeroy')
fig <- fig %>% layout(xaxis = list(title = 'Time Worked'),
yaxis = list(title = 'Count'))
fig
df <- data
head(df)
## Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53 2
## 2 IND28133 80 86 5
## 3 IND07164 11 88 7
## 4 IND30478 72 87 5
## 5 IND24003 37 52 2
## 6 IND08609 41 50 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
str(df)
## 'data.frame': 14999 obs. of 11 variables:
## $ Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : int 38 80 11 72 37 41 10 92 89 42 ...
## $ last_evaluation : int 53 86 88 87 52 50 77 85 100 53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
data1 <- df%>%
group_by(Department)%>%
summarize(Avg_hrs = mean(average_montly_hours))
fig <- plot_ly(data1, x = ~Department, y = ~Avg_hrs, type = 'bar', color = I("dark blue"))
fig <- fig %>% layout(title = "Average monthly working hours according to department",
xaxis = list(title = "Department"),
yaxis = list(title = "Average monthly working hours"))
fig
#ggplot(df,aes(x=number_project,y=average_montly_hours))+geom_jitter(aes(color=Department))
data2 <- df%>%
filter(Work_accident==1)%>%
group_by(Department)%>%
summarize(No_of_wa = n())%>%
arrange(No_of_wa)
head(data2)
## # A tibble: 6 x 2
## Department No_of_wa
## <chr> <int>
## 1 hr 89
## 2 accounting 96
## 3 management 103
## 4 product_mng 132
## 5 RandD 134
## 6 marketing 138
hc <- data2 %>%
hchart('line', hcaes(x = Department, y = No_of_wa))%>%
hc_title(text = "Number of work accidents for each department")%>%
hc_yAxis(title = "Number of work accidents")
hc
l <- df %>% filter(salary == "low")
m <- df %>% filter(salary == "medium")
h <- df %>% filter(salary == "high")
hc2 <- hchart(
density(l$satisfaction_level), type = "area",
color = "steelblue", name = "Low Salary"
) %>%
hc_add_series(
density(m$satisfaction_level), type = "area",
color = "#B71C1C",
name = "Medium Salary"
)%>%
hc_add_series(
density(h$satisfaction_level), type = "area",
color = "yellow",
name = "High Salary"
)%>%
hc_title(text = "Density plot of satisfaction level according to salary")%>%
hc_xAxis(title = "Satisfaction Level (0-100)")
hc2
fig <- plot_ly(df, labels = ~Department, values = ~time_spend_company, type = 'pie')
fig <- fig %>% layout(title = 'Time spent per Department',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
p <- ggplot(data, aes(x =satisfaction_level))+
geom_bar(color="darkblue", fill="lightblue")+
ggtitle("Distribution of Satisfaction level") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p
fig <- plot_ly(data,x=~Department, y=~number_project,color = ~salary,type="bar")
fig
data %>%
ggplot(aes(x=satisfaction_level,y=average_montly_hours))+
geom_boxplot(fill="lightblue")+
xlab("satisfaction_level")+
ylab("average_montly_hours")+
facet_grid(~salary)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
fig <- plot_ly(data,labels = ~salary, values = ~time_spend_company,type="pie", textinfo='label+percent')
fig
fig <- plot_ly(data, x = ~promotion_last_5years, y = ~number_project ,type = 'bar', color=~Department)
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
set.seed(234)
dataDT=data[2:11]
smpl<-sample(2,nrow(dataDT),replace=T,prob=c(0.8,0.2))
train<-dataDT[smpl==1,]
test<-dataDT[smpl==2, ]
fit <- rpart(left~., data = train, method = 'class')
rpart.plot(fit, extra = 106)
predict_unseen <-predict(fit, test, type = 'class')
table_mat <- table(test$left, predict_unseen)
table_mat
## predict_unseen
## 0 1
## 0 2270 26
## 1 58 647
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.9720093
data$left<-as.factor(data$left)
#data$rank<-as.factor(data$rank)
str(data)
## 'data.frame': 14999 obs. of 11 variables:
## $ Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : int 38 80 11 72 37 41 10 92 89 42 ...
## $ last_evaluation : int 53 86 88 87 52 50 77 85 100 53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
dataNB<-data[2:11]
#pairs.panels(data)
#cor(data$gre,data$gpa)
set.seed(234)
smpl<-sample(2,nrow(dataNB),replace=T,prob=c(0.8,0.2))
train<-dataNB[smpl==1,]
test<-dataNB[smpl==2, ]
mdl<-naive_bayes(left~ .,data=train)
#mdl
#plot(mdl)
p<-predict(mdl,train,type='prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(cbind(p,train))
## 0 1 satisfaction_level last_evaluation number_project
## 1 0.147803194 0.8521968 38 53 2
## 2 0.734647017 0.2653530 80 86 5
## 3 0.001099585 0.9989004 11 88 7
## 4 0.486081254 0.5139187 72 87 5
## 5 0.139957478 0.8600425 37 52 2
## 6 0.154630169 0.8453698 41 50 2
## average_montly_hours time_spend_company Work_accident left
## 1 157 3 0 1
## 2 262 6 0 1
## 3 272 4 0 1
## 4 223 5 0 1
## 5 159 3 0 1
## 6 153 3 0 1
## promotion_last_5years Department salary
## 1 0 sales low
## 2 0 sales medium
## 3 0 sales medium
## 4 0 sales low
## 5 0 sales low
## 6 0 sales low
p1<-predict(mdl,train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1<-table(p1,train$left))
##
## p1 0 1
## 0 7369 739
## 1 1763 2127
accuracy=sum(diag(tab1))/sum(tab1)
accuracy
## [1] 0.7914652
df2 <- df
df2$left <- as.factor(df2$left)
str(df2)
## 'data.frame': 14999 obs. of 11 variables:
## $ Emp_Id : chr "IND02438" "IND28133" "IND07164" "IND30478" ...
## $ satisfaction_level : int 38 80 11 72 37 41 10 92 89 42 ...
## $ last_evaluation : int 53 86 88 87 52 50 77 85 100 53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
df2$Emp_Id <- NULL
set.seed(234)
split <- sample.split(df2, SplitRatio = 0.7)
split
## [1] TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
train <- subset(df2, split == "TRUE")
test <- subset(df2, split == "FALSE")
classifier = svm(formula = left ~ .,
data = train,
type = 'C-classification',
kernel = 'linear')
y_pred = predict(classifier, newdata = test[-7])
y_train_pred = predict(classifier, newdata = train[-7])
cm = table(test[, 7], y_pred)
cm
## y_pred
## 0 1
## 0 3208 220
## 1 792 280
cm2 = table(train[, 7], y_train_pred )
cm2
## y_train_pred
## 0 1
## 0 7525 475
## 1 1864 635
sum(diag(cm))/sum(cm)
## [1] 0.7751111
# Splitting data in train and test data
# Fitting Random Forest to the train dataset
set.seed(120) # Setting seed
classifier_RF = randomForest(x = train[-7],
y = train$left,
ntree = 50)
classifier_RF
##
## Call:
## randomForest(x = train[-7], y = train$left, ntree = 50)
## Type of random forest: classification
## Number of trees: 50
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.01%
## Confusion matrix:
## 0 1 class.error
## 0 7986 14 0.00175000
## 1 92 2407 0.03681473
# Predicting the Test set results
y_pred = predict(classifier_RF, newdata = test[-7])
# Confusion Matrix
confusion_mtx = table(test[, 7], y_pred)
confusion_mtx
## y_pred
## 0 1
## 0 3422 6
## 1 44 1028
# Plotting model
plot(classifier_RF)
# Importance plot
importance(classifier_RF)
## MeanDecreaseGini
## satisfaction_level 1354.816274
## last_evaluation 405.770829
## number_project 678.604828
## average_montly_hours 552.729814
## time_spend_company 702.646083
## Work_accident 18.387495
## promotion_last_5years 2.336715
## Department 40.308236
## salary 31.676829
# Variable importance plot
varImpPlot(classifier_RF)
sum(diag(confusion_mtx))/sum(confusion_mtx)
## [1] 0.9888889
# Loading package
set.seed(234)
# Splitting dataset
split <- sample.split(data, SplitRatio = 0.8)
split
## [1] TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
train<- subset(data[2:11], split == "TRUE")
test<- subset(data[2:11], split == "FALSE")
# Training model
logistic_model <- glm(left ~ satisfaction_level + last_evaluation, data = train, family = "binomial")
logistic_model
##
## Call: glm(formula = left ~ satisfaction_level + last_evaluation, family = "binomial",
## data = train)
##
## Coefficients:
## (Intercept) satisfaction_level last_evaluation
## 0.603535 -0.038676 0.005413
##
## Degrees of Freedom: 10908 Total (i.e. Null); 10906 Residual
## Null Deviance: 11980
## Residual Deviance: 10300 AIC: 10310
# Summary
summary(logistic_model)
##
## Call:
## glm(formula = left ~ satisfaction_level + last_evaluation, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4646 -0.7053 -0.5003 -0.3326 2.3030
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.603535 0.111930 5.392 6.96e-08 ***
## satisfaction_level -0.038676 0.001027 -37.669 < 2e-16 ***
## last_evaluation 0.005413 0.001409 3.841 0.000123 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11977 on 10908 degrees of freedom
## Residual deviance: 10300 on 10906 degrees of freedom
## AIC: 10306
##
## Number of Fisher Scoring iterations: 4
# Predict test data based on model
predict<- predict(logistic_model,test, type = "response")
#predict
# Changing probabilities
predict<- ifelse(predict>0.5, 1, 0)
# Evaluating model accuracy
# using confusion matrix
table(test$left, predict)
## predict
## 0 1
## 0 2910 207
## 1 729 244
missing_classerr <- mean(predict != test$left)
print(paste('Accuracy =', 1 - missing_classerr))
## [1] "Accuracy = 0.771149144254279"
# ROC-AUC Curve
ROCPred <- prediction(predict, test$left)
ROCPer <- performance(ROCPred, measure = "tpr", x.measure = "fpr")
auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.5921804
# Plotting curve
plot(ROCPer)
plot(ROCPer, colorize = TRUE,print.cutoffs.at = seq(0.1, by = 0.1),main = "ROC CURVE")
abline(a = 0, b = 1)
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)
f=data$left
data = subset(data, select = -c(left) )
data$left <- f
data_df<-data
data$Department= as.numeric(as.factor(data$Department))
data$salary= as.numeric(as.factor(data$salary))
head(data_df)
## Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438 38 53 2
## 2 IND28133 80 86 5
## 3 IND07164 11 88 7
## 4 IND30478 72 87 5
## 5 IND24003 37 52 2
## 6 IND08609 41 50 2
## average_montly_hours time_spend_company Work_accident promotion_last_5years
## 1 157 3 0 0
## 2 262 6 0 0
## 3 272 4 0 0
## 4 223 5 0 0
## 5 159 3 0 0
## 6 153 3 0 0
## Department salary left
## 1 sales low 1
## 2 sales medium 1
## 3 sales medium 1
## 4 sales low 1
## 5 sales low 1
## 6 sales low 1
set.seed(123)
split <- sample.split(data, SplitRatio = 0.7)
train_cl <- subset(data, split == "TRUE")
test_cl <- subset(data, split == "FALSE")
# Feature Scaling
train<- scale(train_cl[, 2:10])
test<- scale(test_cl[, 2:10])
# Fitting KNN Model
# to training dataset
# K = 3
classifier_knn <- knn(train = train,
test = test,
cl = train_cl$left,
k = 3)
# Confusiin Matrix
cm <- table(test_cl$left, classifier_knn)
cm
## classifier_knn
## 0 1
## 0 3973 182
## 1 92 1207
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test_cl$left)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.94976164283095"
#head(data)
satisfaction_level<-57
last_evaluation<-34
number_project<-3
average_montly_hours<-130
time_spend_company<-3
Work_accident<-0
promotion_last_5years<-1
s1<-levels(factor(data_df$Department))
s1
## [1] "accounting" "hr" "IT" "management" "marketing"
## [6] "product_mng" "RandD" "sales" "support" "technical"
s2<-as.numeric(levels(factor(data$Department)))
s2
## [1] 1 2 3 4 5 6 7 8 9 10
Department<-"IT"
p<-as.numeric(match(Department,s1))
Department=s2[p]
Department
## [1] 3
a1<-levels(factor(data_df$salary))
a1
## [1] "high" "low" "medium"
a2<-as.numeric(levels(factor(data$salary)))
a2
## [1] 1 2 3
salary<-"medium"
p1<-as.numeric(match(salary,a1))
salary=a2[p1]
salary
## [1] 3
l1<-levels(factor(data_df$left))
l1
## [1] "0" "1"
l2<-as.numeric(levels(factor(data$left)))
l2
## [1] 0 1
x=c(satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Department,salary)
classifier_knn <- knn(train = train,
test = test,
cl = train_cl$left,
k = 3)
z<-knn(train=train_cl[,2:10],test = x,cl = train_cl$left, k = 3)
z1<-as.numeric(match(z,l2))
cat("Left :",l2[z1])
## Left : 0